--- redirect_from: - "/05clustering/clustering" interact_link: content/05Clustering/clustering.ipynb kernel_name: python3 kernel_path: content/05Clustering has_widgets: false title: |- Cluster Analysis pagenum: 4 prev_page: url: /03AssociationAnalysis/mba.html next_page: url: /04Code/Code.html suffix: .ipynb search: comment: "***PROGRAMMATICALLY GENERATED, DO NOT EDIT. SEE ORIGINAL FILES IN /content***" ---
Cluster Analysis
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import numpy as np
import plotly.express as px
import plotly.offline as py
import pandas as pd
from sklearn import decomposition
from pymongo import MongoClient
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from scipy import stats
from sklearn.preprocessing import PowerTransformer
import numpy as np
from sklearn.manifold import Isomap
import matplotlib.pyplot as plt
from sklearn.metrics import  mean_absolute_percentage_error
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import PowerTransformer
# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.datasets import fetch_california_housing
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
import plotly.graph_objects as go
import plotly.tools as tls
from plotly.offline import plot, iplot, init_notebook_mode
from IPython.core.display import display, HTML
from plotly.subplots import make_subplots
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import TruncatedSVD
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
# Offline Plotly setup: render figures inline and hide the mode bar / edit links.
config = dict(showLink=False, displayModeBar=False)
init_notebook_mode(connected=True)

# Numeric attributes used for clustering (includes the Price target).
num_att = ['First Registration', 'Mileage', 'Power(hp)', 'Displacement', 'Price']

# Numeric predictors only — num_att without Price.
X_num_att = ['First Registration', 'Mileage', 'Power(hp)', 'Displacement']

# Categorical attributes that get one-hot encoded downstream.
cat_att = ['Make', 'Model', 'Body', 'Fuel', 'Gearing Type']



def df_to_plotly(df):
    """Unpack a DataFrame into the z/x/y mapping Plotly heatmaps expect."""
    payload = dict(
        z=df.values.tolist(),
        x=list(df.columns),
        y=list(df.index),
    )
    return payload


class InitalCleaning(BaseEstimator, TransformerMixin):
    """First-pass cleaning of raw car ads.

    Drops bookkeeping columns, coerces numeric fields, removes rows with
    unusable nulls and obviously fake listings, and normalises Fuel and
    Gearing Type values.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, play_df, y=None):
        play_df = play_df.drop(columns=['ID', 'Loaded_in_DW', 'Model Code'])

        # Coerce the measurement columns to numeric dtypes.
        for col in ('Power(hp)', 'Displacement', 'Mileage', 'Price'):
            play_df[col] = pd.to_numeric(play_df[col])

        # Keep only rows where the essential fields are present.
        play_df = play_df[play_df['Make'].notna() & play_df['Model'].notna()]
        # At least one of Displacement / Power(hp) must be known.
        play_df = play_df[play_df['Displacement'].notna() | play_df['Power(hp)'].notna()]
        play_df = play_df[play_df['Body'].notna()]
        play_df = play_df[play_df['Fuel'].notna()]
        play_df = play_df[play_df['Price'].notna()]

        # Remove implausible ("fake") ads. The negated comparisons keep NaNs.
        play_df = play_df[~(play_df['Displacement'] < 900)]
        play_df = play_df[~(play_df['Displacement'] > 70000)]
        premium = play_df['Make'].isin(['Audi', 'BMW', 'Mercedes-Benz'])
        play_df = play_df[~((play_df['Power(hp)'] > 500) & ~premium)]
        play_df = play_df[~((play_df['Power(hp)'] < 30) & ~(play_df['Fuel'] == 'Electricity'))]
        play_df = play_df[~(play_df['Price'] > 300000)]

        # Keep only the primary fuel type (text before the first '/').
        play_df['Fuel'] = play_df['Fuel'].str.split('/').str[0]
        # Missing gearing information is assumed to mean a manual gearbox.
        play_df['Gearing Type'] = play_df['Gearing Type'].fillna('Manual')

        return play_df
    
    
class AdjustEquip(BaseEstimator, TransformerMixin):
    """Normalise the equipment flag columns (positional columns 9..78) to 0/1 ints.

    NaN means "not present" (0); the string '1' and any other non-0/1 marker
    observed in the Warranty column (e.g. warranty duration strings) mean
    "present" (1).
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, play_df, y=None):
        equipment = play_df.iloc[:, 9:79]
        equipment = equipment.replace({np.nan: 0})
        equipment = equipment.replace({'1': 1})

        # Collect the leftover marker values from the Warranty column and map
        # them all to 1 across the whole equipment frame.
        markers = set(equipment['Warranty'])
        # Bug fix: discard() instead of remove() — remove() raises KeyError
        # when 0 or 1 happens to be absent from the column.
        markers.discard(0)
        markers.discard(1)
        if markers:
            equipment = equipment.replace(list(markers), 1)

        # Vectorised cast of every flag column to int (replaces per-column loop).
        equipment = equipment.astype(int)
        play_df.iloc[:, 9:79] = equipment

        return play_df
    
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode the car's categorical attributes via pandas get_dummies."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, play_df, y=None):
        dummy_cols = ['Make', 'Model', 'Body', 'Fuel', 'Gearing Type']
        # dummy_na=False: rows with NaN in these columns get all-zero dummies.
        return pd.get_dummies(play_df, columns=dummy_cols, dummy_na=False)

class RegistrationTransformer(BaseEstimator, TransformerMixin):
    """Convert 'First Registration' to fractional years and filter to BMW subsets.

    tip=0 keeps post-2005 BMW models whose name starts with '1' or '3';
    tip=1 keeps all post-2005 BMWs; any other tip leaves the frame unfiltered.
    """

    def __init__(self, tip=0):
        self.tip = tip

    def fit(self, X, y=None):
        return self

    def transform(self, play_df, y=None):
        # Reset the index so the converted dates align row-for-row on assignment
        # (avoids NaNs introduced by index misalignment).
        play_df.reset_index(drop=True, inplace=True)
        converted = date_magic(play_df['First Registration'])
        converted.reset_index(drop=True, inplace=True)
        play_df['First Registration'] = converted

        recent_bmw = (play_df['Make'] == 'BMW') & (play_df['First Registration'] > 2005)
        if self.tip == 0:
            series_1_or_3 = (play_df['Model'].str.startswith('3')
                             | play_df['Model'].str.startswith('1'))
            play_df = play_df[recent_bmw & series_1_or_3]
        elif self.tip == 1:
            play_df = play_df[recent_bmw]

        return play_df


class IQROutlierRemoval_new(BaseEstimator, TransformerMixin):
    """Pass-through stage that projects the frame onto the configured columns.

    The IQR row filtering this class originally performed has been
    deliberately disabled; only the column selection remains.
    """

    def __init__(self, num_att):
        self.num_att = num_att

    def fit(self, X, y=None):
        return self

    def transform(self, play_df, y=None):
        # Bug fix: use the configured self.num_att instead of silently relying
        # on the module-level num_att list.
        return pd.DataFrame(play_df, columns=self.num_att)


class IQROutlierRemoval(BaseEstimator, TransformerMixin):
    """Drop rows whose numeric attributes fall outside the 1.5*IQR fences.

    Only the columns in num_att are kept and tested; a row is removed if ANY
    of those columns lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
    """

    def __init__(self, num_att):
        self.num_att = num_att

    def fit(self, X, y=None):
        return self

    def transform(self, play_df, y=None):
        numeric = play_df[self.num_att]
        q1 = numeric.quantile(0.25)
        q3 = numeric.quantile(0.75)
        iqr = q3 - q1
        # The negated comparisons keep rows with NaN values (NaN compares False).
        outside = (numeric < (q1 - 1.5 * iqr)) | (numeric > (q3 + 1.5 * iqr))
        numeric = numeric[~outside.any(axis=1)]
        # Bug fix: previously referenced the module-level num_att here; use the
        # attribute the instance was configured with. (Also removed the unused
        # bmw_cat intermediate.)
        return pd.DataFrame(numeric, columns=self.num_att)
    

class ZScoreOutlierRemoval(BaseEstimator, TransformerMixin):
    """Keep only rows where every column's |z-score| is below 3."""

    def __init__(self, num_att):
        # num_att is stored for signature symmetry with the other removers;
        # the z-score is computed over every column of the incoming frame.
        self.num_att = num_att

    def fit(self, X, y=None):
        return self

    def transform(self, play_df, y=None):
        # nan_policy='omit' leaves NaN z-scores in place; NaN < 3 is False, so
        # rows containing NaNs are dropped as well (this comparison is the
        # source of the "invalid value encountered in less" RuntimeWarning).
        abs_z = np.abs(stats.zscore(play_df, nan_policy='omit'))
        keep = (abs_z < 3).all(axis=1)
        return play_df[keep]
    

class StandardScalerIndices(BaseEstimator, TransformerMixin):
    """StandardScaler wrapper that preserves the DataFrame's index and columns."""

    def __init__(self):
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        self.scaler.fit(X)
        return self

    def transform(self, play_df, y=None):
        scaled = self.scaler.transform(play_df)
        # Rewrap the ndarray so downstream stages keep label-based access.
        return pd.DataFrame(scaled, columns=play_df.columns, index=play_df.index)

    
class PowerTransformerIndices(BaseEstimator, TransformerMixin):
    """PowerTransformer wrapper that preserves the DataFrame's index and columns.

    method is forwarded to sklearn's PowerTransformer (e.g. 'yeo-johnson').
    """

    def __init__(self, method):
        self.method = method
        self.transformer = PowerTransformer(method=self.method)

    def fit(self, X, y=None):
        self.transformer.fit(X)
        return self

    def transform(self, play_df, y=None):
        transformed = self.transformer.transform(play_df)
        # Rewrap the ndarray so downstream stages keep label-based access.
        return pd.DataFrame(transformed, columns=play_df.columns, index=play_df.index)
    
    
class RemovePrice(BaseEstimator, TransformerMixin):
    """Drop the 'Price' target column from the feature frame."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, play_df, y=None):
        return play_df.drop(columns='Price')

class IterativeImputerIndices(BaseEstimator, TransformerMixin):
    """IterativeImputer wrapper that preserves the DataFrame's index and columns."""

    def __init__(self, estimator):
        self.imputer = IterativeImputer(estimator=estimator)

    def fit(self, X, y=None):
        self.imputer.fit(X)
        return self

    def transform(self, play_df, y=None):
        imputed = self.imputer.transform(play_df)
        # Rewrap the ndarray so downstream stages keep label-based access.
        return pd.DataFrame(imputed, columns=play_df.columns, index=play_df.index)

class JoinTransformer(BaseEstimator, TransformerMixin):
    """Join back the columns of BMW_df that are not listed in new_att."""

    def __init__(self, BMW_df, new_att):
        self.BMW_df = BMW_df
        self.new_att = new_att

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        remainder = self.BMW_df.drop(self.new_att, axis=1)
        return X.join(remainder)
        

def date_magic(d):
    """Convert mixed registration-date strings to fractional years.

    Accepts an iterable of values in any of three formats:
      - 'Mon-YY'  (e.g. 'Jan-15'): two-digit years > 20 are mapped to 19YY,
        otherwise to 20YY;
      - 'M/YYYY'  (e.g. '3/2010'): numeric month and full year;
      - 'YYYY'    (bare year): month defaults to January.

    Returns a pd.Series of floats: year + month / 12.
    (Removed the unused num_year list initialisation from the original.)
    """
    month_of = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
                'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
    months = []
    years = []
    for raw in d:
        text = str(raw)

        parts = text.split('-')
        if len(parts) > 1:
            months.append(month_of[parts[0]])
            # Two-digit year pivot: '98' -> 1998, '15' -> 2015.
            century = '19' if int(parts[1]) > 20 else '20'
            years.append(int(century + parts[1]))
            continue

        parts = text.split('/')
        if len(parts) > 1:
            months.append(int(parts[0]))
            years.append(int(parts[1]))
            continue

        # Bare year: assume January.
        months.append(1)
        years.append(int(text))

    frac_years = pd.Series(years) + pd.Series(months) / 12
    return pd.Series(frac_years)


def readData():
    """Fetch un-warehoused, de-duplicated car ads from MongoDB.

    Returns a DataFrame of documents where Loaded_in_DW is False, with the
    Mongo '_id' column removed and duplicate ad IDs dropped.
    """
    import os

    # SECURITY: the connection string with credentials was hard-coded here.
    # Prefer supplying it via the MONGO_URI environment variable; the literal
    # fallback preserves the original behaviour but these credentials should
    # be rotated and removed from source control.
    uri = os.environ.get(
        'MONGO_URI',
        'mongodb+srv://Martin:Kostadinov@dwprojectcluster.lpqbf.mongodb.net/cars_database?retryWrites=true&w=majority',
    )
    client = MongoClient(uri)

    df_cars = pd.DataFrame(list(client.cars_database.cars.find({})))
    df_cars.drop('_id', axis=1, inplace=True)
    # Keep only ads not yet loaded into the data warehouse.
    df_cars = df_cars[df_cars['Loaded_in_DW'].eq(False)]
    df_cars.drop_duplicates(subset=['ID'], inplace=True)

    return df_cars




# Accumulator for cross-validation results, appended to by randomforestCV.
recnik = {
    'Method': [],
    'Mean Percentage Error': [],
    'Standard Deviation': [],
}
def randomforestCV( max_features, n_estimators, X, y, message, scoring = 'neg_mean_absolute_percentage_error', recnik = recnik):
    """Fit a RandomForestRegressor, 5-fold cross-validate it and record scores.

    The negated score means are printed and appended (under `message`) to the
    shared `recnik` accumulator, which is deliberately bound as the default.
    """
    forest = RandomForestRegressor(max_features=max_features,
                                   n_estimators=n_estimators,
                                   random_state=2)
    forest.fit(X, y)
    cv_scores = cross_val_score(forest, X, y, scoring=scoring, cv=5, n_jobs=-1)
    # Scores are negative errors; negate for readability.
    print("Scores:", -cv_scores)
    print("Mean:", -cv_scores.mean())
    print("Standard deviation:", cv_scores.std())
    recnik['Method'].append(message)
    recnik['Mean Percentage Error'].append(-cv_scores.mean())
    recnik['Standard Deviation'].append(cv_scores.std())

    
def gridSearch(param_grid, model, X, y):
    """Run a 5-fold MAPE grid search and print each candidate's mean test score.

    Returns the fitted GridSearchCV object.
    """
    searcher = GridSearchCV(
        model,
        param_grid,
        cv=5,
        scoring='neg_mean_absolute_percentage_error',
        return_train_score=True,
        verbose=10,
        n_jobs=-1,
    )
    searcher.fit(X, y)
    results = searcher.cv_results_
    for mean_score, params in zip(results["mean_test_score"], results["params"]):
        # Negate: sklearn reports negative errors for 'neg_*' scorers.
        print(-mean_score, params)
    return searcher
df_cars = readData()
# Cleaning pipeline: drop bad rows/columns, normalise equipment flags to 0/1,
# then convert dates and restrict to post-2005 BMW 1-/3-series (tip = 0).
type0_pipeline = Pipeline([
                        ('initial', InitalCleaning()),
                        ('equipment', AdjustEquip()),
                        ('date', RegistrationTransformer(tip = 0))
                    ])
BMW_df = type0_pipeline.fit_transform(df_cars)
# Cluster only on the numeric attributes (including Price).
clustering = BMW_df[num_att]

# Numeric preprocessing: IQR outlier fences -> standardise -> Yeo-Johnson
# power transform -> z-score outlier removal -> iterative imputation of the
# remaining NaNs with an extra-trees estimator.
final_pipeline = Pipeline([
                        ('IQR_removal',IQROutlierRemoval(num_att = num_att)),
                        ('std_scaler', StandardScalerIndices()),
                        ('yeo-johnson',PowerTransformerIndices(method='yeo-johnson')),
                        ('z_score_removal', ZScoreOutlierRemoval(num_att = num_att)),
                        ('Imputer', IterativeImputerIndices(estimator = ExtraTreesRegressor(n_estimators=10, random_state=0))),
                        
                        ])
ready = final_pipeline.fit_transform(clustering)
# Captured notebook stderr output (RuntimeWarning raised by the z-score
# comparison in ZScoreOutlierRemoval when NaNs are present):
# E:\Users\Filip\Anaconda\lib\site-packages\ipykernel_launcher.py:198: RuntimeWarning:
#
# invalid value encountered in less
# Fit K-Means once for every k in 2..9 on the preprocessed matrix.
kmeans_per_k = [KMeans(n_clusters=k, random_state=42).fit(ready)
                for k in range(2, 10)]



# Silhouette score for each fitted model (list position i corresponds to
# k = i + 2; higher score = better-separated clusters).
silhouette_scores = [silhouette_score(ready, model.labels_)
                     for model in kmeans_per_k[0:]]
fig = go.Figure()
fig.add_trace(go.Scatter(y = silhouette_scores,  name = 'Scores'))

fig.update_layout(plot_bgcolor='rgb(255,255,255)',xaxis_title="k",
    yaxis_title='Silhouette Scores')
fig.update_xaxes(ticks = 'outside', showline=True, linecolor='black')
fig.update_yaxes(ticks = 'outside', showline=True, linecolor='black')



# NOTE(review): the x axis plots list positions 0..7, not k itself — read
# position i as k = i + 2 when interpreting the curve.
plot(fig, filename = 'fig.html', config = config)
display(HTML('fig.html'))
# kmeans_per_k[3] is the k = 5 model (the list starts at k = 2).
labels = kmeans_per_k[3].labels_
# Train a decision tree to predict the cluster labels so its feature
# importances indicate which attributes separate the clusters.
model = DecisionTreeClassifier()
# fit the model
model.fit(ready, labels)

fig2 = px.bar(y = model.feature_importances_)

# Relabel bar positions 0..4 with the corresponding feature names.
fig2.update_layout(
    xaxis = dict(
        tickmode = 'array',
        tickvals = [0, 1, 2, 3, 4],
        ticktext = ['First Registration', 'Mileage', 'Power(hp)', 'Displacement','Price']
    )
)


fig2.update_layout(plot_bgcolor='rgb(255,255,255)',xaxis_title="Feature Name",
    yaxis_title='Importance')


plot(fig2, filename = 'fig2.html', config = config)
display(HTML('fig2.html'))
# Append the cluster labels to the preprocessed frame and plot the
# correlation heatmap including the label column.
ready['Labels'] = labels
corr2 = df_to_plotly(ready.corr())
fig3 = px.imshow(corr2['z'], x = corr2['x'], y = corr2['y'], color_continuous_scale=px.colors.sequential.RdBu)

plot(fig3, filename = 'fig3.html', config = config)
display(HTML('fig3.html'))
# Join the labels back onto the cleaned BMW frame; rows removed during
# outlier filtering have no label (NaN) and are dropped.
lab_id = ready['Labels']
joinn = BMW_df.join(lab_id)
joinn = joinn[~(joinn['Labels'].isna())]
join = joinn[num_att]
# 3-D scatter of the clusters in (Power, Displacement, Price) space.
# NOTE(review): `labels` has one entry per row of `ready`, while `join` keeps
# only the non-NaN joined rows — confirm the two lengths line up before
# trusting the marker colours.
trace1 = go.Scatter3d(
    x= join['Power(hp)'],
    y= join['Displacement'],
    z= join['Price'],
    mode='markers',
     marker=dict(
        color = labels, 
        size= 10,
        line=dict(
            color= labels,
            width= 12
        ),
        opacity=0.8
     )
)


layout = go.Layout(
    title = 'Clusters',
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0  
    ),
    scene = dict(
            xaxis = dict(title  = 'Power(hp)'),
            yaxis = dict(title  = 'Displacement'),
            zaxis = dict(title  = 'Price')
        )
)

fig4 = go.Figure(data = trace1, layout = layout)

plot(fig4, filename = 'fig4.html', config = config)
display(HTML('fig4.html'))